# Read one raw survey CSV per year, keep the analysis columns, tag each row
# with an id and the survey year, keep only respondents whose first or second
# stated purpose code is 1, and stack the years into `all.df`.
# NOTE(review): 103-106 are presumably ROC (Minguo) calendar years -- confirm.
years <- c(103, 104, 105, 106)

# Collect each year's frame in a preallocated list and bind once at the end,
# instead of re-copying the accumulated data frame on every iteration.
year.dfs <- vector("list", length(years))
for (i in seq_along(years)) {
  message(years[i])
  file.name <- paste0("./data/", years[i], "年來臺旅客消費及動向調查(原始資料).csv")
  df <- read.csv(file.name, fileEncoding = "BIG5")
  df <- df %>%
    select(
      stay, freq, purp1, purp2, type, prepay, pmoney, pdollar, airf,
      pit1, pit2, pit3, pit4, pit5, money, dollar,
      money1, p1, money2, p2, money3, p3, money4, p4, money5, p5, money6, p6,
      m601, m602, m603, m604, m605, m606, m607, m608, m609, m610,
      act01, act02, act03, act04, act05, act06, act07, act08, act09, act10,
      act11, act12, act13, act14, act15, act16, act17, act18, act19,
      nation, age, income, educ, occup, gender
    ) %>%
    # id = survey year followed by the zero-padded row position in the raw
    # file; it is assigned before filtering so it still points at that row.
    mutate(id = sprintf("%d%04d", years[i], row_number())) %>%
    filter(purp1 == 1 | purp2 == 1)
  df$year <- years[i]
  year.dfs[[i]] <- df
}
# Base rbind() is kept (rather than a stricter binder) so columns that are
# integer in one year and character in another are still coerced as before.
all.df <- do.call(rbind, year.dfs)
## 103
## 104
## 105
## 106
# Persist the combined data to disk, then read it back so the rest of the
# script works from the saved copy.
saveRDS(object = all.df, file = "alldf.rds")
alldf <- readRDS(file = "./alldf.rds")

# Quick sanity checks: dimensions first, then per-column types.
dim(x = alldf)
## [1] 23989 65
str(object = alldf)
## 'data.frame': 23989 obs. of 65 variables:
## $ stay : int 12 6 6 7 7 17 7 7 5 5 ...
## $ freq : int 4 1 1 1 1 2 1 1 1 1 ...
## $ purp1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ purp2 : int 99 99 99 99 99 99 99 99 99 99 ...
## $ type : int 3 1 1 1 1 5 1 1 1 1 ...
## $ prepay : int 1 1 1 1 1 2 1 1 1 1 ...
## $ pmoney : chr "25000" "4000" "4000" "3880" ...
## $ pdollar: chr "16" "2" "2" "2" ...
## $ airf : chr "2" "1" "1" "1" ...
## $ pit1 : chr "1" "1" "1" "1" ...
## $ pit2 : chr "0" "1" "1" "1" ...
## $ pit3 : chr "0" "1" "1" "1" ...
## $ pit4 : chr "0" "1" "1" "1" ...
## $ pit5 : chr "0" "1" "1" "1" ...
## $ money : int 35000 10000 8000 1500 4500 50000 5000 7000 10000 4000 ...
## $ dollar : chr "16" "2" "2" "2" ...
## $ money1 : int 0 0 0 0 0 35000 0 0 0 0 ...
## $ p1 : chr "." "." "." "." ...
## $ money2 : int 12250 0 0 0 0 5000 0 0 0 0 ...
## $ p2 : chr "2" "." "." "." ...
## $ money3 : int 12250 0 0 0 0 7500 0 0 0 0 ...
## $ p3 : chr "2" "." "." "." ...
## $ money4 : int 7000 0 0 0 0 2500 0 0 0 0 ...
## $ p4 : chr "2" "." "." "." ...
## $ money5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ p5 : chr "." "." "." "." ...
## $ money6 : int 3500 10000 8000 1500 4500 0 5000 7000 10000 4000 ...
## $ p6 : chr "2" "1" "1" "1" ...
## $ m601 : int 0 1000 0 0 1800 0 0 1400 0 0 ...
## $ m602 : int 0 4000 0 0 0 0 0 0 0 1200 ...
## $ m603 : int 350 0 1600 0 0 0 0 350 0 800 ...
## $ m604 : int 0 0 0 0 900 0 0 1750 0 1600 ...
## $ m605 : int 3150 5000 2400 1500 1800 0 2500 3500 3000 400 ...
## $ m606 : int 0 0 1600 0 0 0 500 0 3000 0 ...
## $ m607 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ m608 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ m609 : int 0 0 2400 0 0 0 2000 0 4000 0 ...
## $ m610 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act01 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act02 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act03 : int 0 1 0 0 0 0 1 1 0 0 ...
## $ act04 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ act05 : int 0 0 0 0 0 0 1 0 0 0 ...
## $ act06 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act07 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act08 : int 1 1 1 1 1 0 1 1 1 1 ...
## $ act09 : int 0 1 1 1 1 0 1 1 1 1 ...
## $ act10 : int 1 1 1 1 1 0 0 0 1 1 ...
## $ act11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act13 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act14 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act15 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act16 : int 0 1 1 0 0 1 0 1 1 0 ...
## $ act17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act18 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ act19 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nation : int 2 2 2 2 2 1 2 2 2 2 ...
## $ age : int 3 6 6 5 6 5 7 2 3 3 ...
## $ income : int 2 8 8 8 8 7 8 2 3 2 ...
## $ educ : int 3 3 2 3 2 3 2 2 2 3 ...
## $ occup : int 7 12 12 12 12 3 12 5 8 5 ...
## $ gender : int 1 2 1 2 1 1 2 2 1 2 ...
## $ id : chr "1030001" "1030002" "1030003" "1030004" ...
## $ year : num 103 103 103 103 103 103 103 103 103 103 ...
# Convert the combined data to a tibble, turn "." placeholders into NA, move
# the id and year columns to the front, and coerce every column to numeric.
alltb <- as_tibble(alldf)
alltb[alltb == "."] <- NA
alltb <- alltb %>%
  select(id, year, everything())

# Convert column by column. apply() would first squash the whole table into a
# character matrix (numeric columns pass through format()), which is slower
# and can alter values before they are parsed back into numbers.
alltb <- as_tibble(
  lapply(alltb, function(x) as.numeric(as.character(x)))
)
alltb
# glimpse(alltb)
# NA & double
# Summarise pmoney for each pdollar value: row count, log of the mean, and
# standard deviation. NA values in pmoney are not dropped, so a group that
# contains any NA gets NA statistics.
# NOTE(review): pdollar looks like a currency code and "XR" like "exchange
# rate" -- confirm against the survey codebook.
PreSum <- alltb %>%
  group_by(pdollar) %>%
  summarise(
    n = n(),
    mean_pmoney = log(mean(pmoney)),
    sd = sd(pmoney)
  )
PreSum

# One point per pdollar value, showing the log mean of pmoney.
ggplot(PreSum, aes(x = pdollar, y = mean_pmoney)) +
  geom_point() +
  labs(
    x = "Dollar(Category)",
    y = "Mean_Pmoney(log)",
    title = "Before Fixing XR"
  )
## Warning: Removed 1 rows containing missing values (geom_point).
# nation
# log(pmoney) against the nation code, coloured by gender, with one panel per
# pdollar value; rows with a missing pdollar are dropped first.
# No group_by() is needed: facet_wrap() does the per-pdollar split, and
# nothing in a point plot uses dplyr grouping.
alltb %>%
  filter(!is.na(pdollar)) %>%
  ggplot(aes(x = nation, y = log(pmoney), color = as.factor(gender))) +
  geom_point() +
  facet_wrap(~pdollar) +
  labs(
    x = "Nation",
    y = "Pmoney(log)",
    title = "NATION: Before Fixing XR"
  )
# occup
# log(pmoney) against the occupation code, coloured by gender, with one panel
# per pdollar value; rows with a missing pdollar are dropped first.
# No group_by() is needed: facet_wrap() does the per-pdollar split, and
# nothing in a point plot uses dplyr grouping.
alltb %>%
  filter(!is.na(pdollar)) %>%
  ggplot(aes(x = occup, y = log(pmoney), color = as.factor(gender))) +
  geom_point() +
  facet_wrap(~pdollar) +
  labs(
    x = "Occupation",
    y = "Pmoney(log)",
    title = "Occupation: Before Fixing XR"
  )
# Same occupation plot, but with occupation code 99 excluded as well.
# NOTE(review): 99 is presumably an "other/unknown" code -- confirm against
# the survey codebook.
# No group_by() is needed: facet_wrap() does the per-pdollar split, and
# nothing in a point plot uses dplyr grouping.
alltb %>%
  filter(!is.na(pdollar), occup != 99) %>%
  ggplot(aes(x = occup, y = log(pmoney), color = as.factor(gender))) +
  geom_point() +
  facet_wrap(~pdollar) +
  labs(
    x = "Occupation",
    y = "Pmoney(log)",
    title = "Occup filter out 99"
  )